In [180]:
%%bash
# List the logical processors and their model names for this machine.
grep 'processor\|model name' /proc/cpuinfo
In [179]:
%%bash
# Report the machine's memory usage; -g prints the figures in gigabyte units.
free -g
In [52]:
from __future__ import print_function
import pandas as pd
import geopandas as gpd
import matplotlib as mpl
import matplotlib.pyplot as plt
from ipywidgets.widgets import interact, Text
from IPython.display import display
import numpy as np
In [4]:
# use the notebook definition for interactive embedded graphics
# %matplotlib notebook
# use the inline definition for static embedded graphics
%matplotlib inline
rcParam = {
'figure.figsize': (12,6),
'font.weight': 'bold',
'axes.labelsize': 20.0,
'axes.titlesize': 20.0,
'axes.titleweight': 'bold',
'legend.fontsize': 14,
'xtick.labelsize': 14,
'ytick.labelsize': 14,
}
for key in rcParam:
mpl.rcParams[key] = rcParam[key]
In [6]:
# Load combined_data.csv; the strings 'NA' and '.' are read as missing values.
# NOTE(review): error_bad_lines=False drops malformed rows instead of raising,
# and newer pandas releases replace this keyword with on_bad_lines -- confirm
# against the pandas version this notebook is pinned to before changing it.
cbs_data = pd.read_csv(
    'combined_data.csv',
    sep=',',
    na_values=['NA', '.'],
    error_bad_lines=False,
)
Let's inspect the contents of this file by looking at the first 5 rows.
As you can see, this file has a lot of columns. For a description of the field names, please see the description file.
In [7]:
# Show the first five rows of the raw table.
cbs_data.head(n=5)
Out[7]:
In [8]:
# Keep only the rows whose YEAR equals 2015.
is_2015 = cbs_data['YEAR'] == 2015
cbs_data_2015 = cbs_data[is_2015]
We subset the full 2010-2015 dataset to just the year 2015.
The table below shows summary statistics for that subset.
In [182]:
# Summary statistics of the 2015 subset.
cbs_data_2015.describe()
# Alternative: summarise only the YEAR column.
#cbs_data_2015.YEAR.describe()
Out[182]:
In [14]:
# Discard every row that has at least one missing value, then recompute the
# summary statistics on the rows that remain.
cbs_data_2015 = cbs_data_2015.dropna(axis=0, how='any')
cbs_data_2015.describe()
Out[14]:
Description of some of the demographic features of this dataset
In [15]:
# Summarise the columns at positions 35..215 (the slice end 216 is exclusive).
# NOTE(review): the feature selection further down slices 37:215 instead --
# confirm which bounds are the intended demographic columns.
demographic_columns = cbs_data_2015.iloc[:, 35:216]
demographic_columns.describe()
Out[15]:
We want to build a label and a set of features from our data.
Label: the relative number of money and property crimes (Vermogensmisdrijven_rel)
Features: all neighbourhood demographic columns in the dataset
In [156]:
# Target vector: the Vermogensmisdrijven_rel column as an array of values.
labels = cbs_data_2015.loc[:, "Vermogensmisdrijven_rel"].values

# Feature names: the column labels at positions 37..214 (end 215 is exclusive).
# NOTE(review): confirm the label column itself does not fall inside this
# positional range, otherwise the target leaks into the features.
columns = cbs_data_2015.columns[37:215].tolist()
In [157]:
# Select the feature columns and convert each one to a numeric dtype where
# possible; a column that cannot be parsed is returned unchanged
# (errors='ignore').
features = cbs_data_2015[columns].apply(
    lambda series: pd.to_numeric(series, errors='ignore')
)
Inspect our labels and features
In [158]:
# Preview the first ten labels, then the first five feature rows.
# The slice starts at index 0 so the very first label is not skipped
# (the previous `1:10` slice dropped it and showed only nine values).
print(labels[:10])
features.head()
Out[158]:
In [159]:
from sklearn.linear_model import RandomizedLasso
Run Randomized Lasso with 3000 resamplings and a maximum of 100 iterations.
In [160]:
# Fit a Randomized Lasso on the features: 3000 resamplings, at most 100
# iterations, with the regularisation strength chosen by AIC (alpha='aic').
# random_state is fixed so that the random resampling -- and therefore the
# resulting feature scores -- is reproducible when the notebook is re-run.
rlasso = RandomizedLasso(
    alpha='aic',
    verbose=True,
    normalize=True,
    n_resampling=3000,
    max_iter=100,
    random_state=42,
)
rlasso.fit(features, labels)
Out[160]:
In [189]:
# Pair every feature name with its score rounded to 4 decimals, and rank the
# pairs from highest to lowest score (ties fall back to the feature name).
rounded_scores = [round(score, 4) for score in rlasso.scores_]
ranked_pairs = sorted(zip(rounded_scores, list(features)), reverse=True)

# One row per feature, best-scoring first.
dfResults = pd.DataFrame(ranked_pairs, columns=['Score', 'FeatureName'])
dfResults.head(10)
Out[189]:
Because many high-scoring features appear at the top of the lasso results table, we want to check how the scores are distributed across all features.
In [186]:
# Bar chart of the score of every feature, in ranked order.
# DataFrame.plot returns the Axes it drew on; use that handle directly instead
# of plt.axes(), which can create a new, empty Axes on top of the chart so
# that the wrong x axis ends up hidden.
ax1 = dfResults.plot('FeatureName', 'Score', kind='bar', color='navy')

# Hide the x axis (it carries one tick label per feature).
x_axis = ax1.get_xaxis()
x_axis.set_visible(False)
plt.show()
In [170]:
# Scatter plot of Vermogensmisdrijven_rel (y) against A_BED_GI (x).
y_values = pd.to_numeric(cbs_data_2015['Vermogensmisdrijven_rel'])
x_values = pd.to_numeric(cbs_data_2015['A_BED_GI'])
plt.scatter(x=x_values, y=y_values)
plt.xlabel('A_BED_GI ( Bedrijfsvestigingen: Handel en horeca )')
plt.ylabel('Vermogensmisdrijven_rel')
plt.show()
In [188]:
# Show the ten lowest-ranked rows of the results table.
dfResults.tail(n=10)
Out[188]:
Let's also inspect one of the worst-scoring variables (percentage of low-income households) and plot it too.
In [183]:
# Scatter plot of Vermogensmisdrijven_rel (y) against P_LAAGINKH (x).
y_values = pd.to_numeric(cbs_data_2015['Vermogensmisdrijven_rel'])
x_values = pd.to_numeric(cbs_data_2015['P_LAAGINKH'])
plt.scatter(x=x_values, y=y_values)
plt.xlabel('Perc. Laaginkomen Huish.')
plt.ylabel('Vermogensmisdrijven_rel')
plt.show()
Try out another hypothesis (e.g. percentage of divorced residents vs. relative rate of violent and sexual crimes)
In [184]:
# Scatter plot of Gewelds_en_seksuele_misdrijven_rel (y) against P_GESCHEID (x).
y_values = pd.to_numeric(cbs_data_2015['Gewelds_en_seksuele_misdrijven_rel'])
x_values = pd.to_numeric(cbs_data_2015['P_GESCHEID'])
plt.scatter(x=x_values, y=y_values)
plt.xlabel('Perc_Gescheiden')
plt.ylabel('Gewelds_en_seksuele_misdrijven_rel')
plt.show()